#!/usr/bin/env bash
#
# Launch a vLLM OpenAI-compatible API server on an Enflame GCU device
# (--device gcu, moe_wna16_gcu quantization).
#
# Tunable environment variables (all optional):
#   MODEL_NAME       model directory name under /home/models (default: Qwen3-30B-A3B-Instruct-AWQ)
#   TP               tensor-parallel size (default: 1)
#   VISIBLE_DEVICES  devices exposed via TOPS_VISIBLE_DEVICES (default: 0)
#   PORT             HTTP listen port (default: 3010)
set -euo pipefail

MODEL_NAME=${MODEL_NAME:-Qwen3-30B-A3B-Instruct-AWQ}
TP=${TP:-1}
VISIBLE_DEVICES=${VISIBLE_DEVICES:-0}
PORT=${PORT:-3010}

# TOPS_VISIBLE_DEVICES is the Enflame GCU analogue of CUDA_VISIBLE_DEVICES.
# exec replaces this shell with the server so signals (e.g. SIGTERM from a
# container runtime) reach python3 directly.
exec env TOPS_VISIBLE_DEVICES="${VISIBLE_DEVICES}" \
  python3 -m vllm.entrypoints.openai.api_server \
      --model "/home/models/${MODEL_NAME}" \
      --served-model-name "${MODEL_NAME}" \
      --gpu-memory-utilization 0.9 \
      --max-model-len 8192 \
      --port "${PORT}" \
      --block-size 64 \
      --dtype auto \
      --device gcu \
      --enable_prefix_caching \
      --tensor-parallel-size "${TP}" \
      --quantization moe_wna16_gcu \
      --trust-remote-code

